{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Feature transformations with ensembles of trees\n\n\nTransform your features into a higher dimensional, sparse space. Then\ntrain a linear model on these features.\n\nFirst fit an ensemble of trees (totally random trees, a random\nforest, or gradient boosted trees) on the training set. Then each leaf\nof each tree in the ensemble is assigned a fixed arbitrary feature\nindex in a new feature space. These leaf indices are then encoded in a\none-hot fashion.\n\nEach sample goes through the decisions of each tree of the ensemble\nand ends up in one leaf per tree. The sample is encoded by setting\nfeature values for these leaves to 1 and the other feature values to 0.\n\nThe resulting transformer has then learned a supervised, sparse,\nhigh-dimensional categorical embedding of the data.\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Tim Head <betatim@gmail.com>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nnp.random.seed(10)\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.datasets import make_classification\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,\n                              GradientBoostingClassifier)\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import roc_curve\nfrom sklearn.pipeline import make_pipeline\n\nn_estimator = 10\nX, y = make_classification(n_samples=80000)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)\n\n# It is important to train the ensemble of trees on a different subset\n# of the training data than the linear regression model to avoid\n# overfitting, in particular if the total number of leaves is\n# similar to the number of training samples\nX_train, X_train_lr, y_train, y_train_lr = train_test_split(\n    X_train, y_train, test_size=0.5)\n\n# Unsupervised transformation based on totally random trees\nrt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,\n                          random_state=0)\n\nrt_lm = LogisticRegression(max_iter=1000)\npipeline = make_pipeline(rt, rt_lm)\npipeline.fit(X_train, y_train)\ny_pred_rt = pipeline.predict_proba(X_test)[:, 1]\nfpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)\n\n# Supervised transformation based on random forests\nrf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)\nrf_enc = OneHotEncoder()\nrf_lm = LogisticRegression(max_iter=1000)\nrf.fit(X_train, y_train)\nrf_enc.fit(rf.apply(X_train))\nrf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)\n\ny_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]\nfpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)\n\n# Supervised transformation based on gradient boosted trees\ngrd = GradientBoostingClassifier(n_estimators=n_estimator)\ngrd_enc = OneHotEncoder()\ngrd_lm = LogisticRegression(max_iter=1000)\ngrd.fit(X_train, y_train)\ngrd_enc.fit(grd.apply(X_train)[:, :, 0])\ngrd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)\n\ny_pred_grd_lm = grd_lm.predict_proba(\n    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]\nfpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)\n\n# The gradient boosted model by itself\ny_pred_grd = grd.predict_proba(X_test)[:, 1]\nfpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)\n\n# The random forest model by itself\ny_pred_rf = rf.predict_proba(X_test)[:, 1]\nfpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)\n\nplt.figure(1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve')\nplt.legend(loc='best')\nplt.show()\n\nplt.figure(2)\nplt.xlim(0, 0.2)\nplt.ylim(0.8, 1)\nplt.plot([0, 1], [0, 1], 'k--')\nplt.plot(fpr_rt_lm, tpr_rt_lm, label='RT + LR')\nplt.plot(fpr_rf, tpr_rf, label='RF')\nplt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')\nplt.plot(fpr_grd, tpr_grd, label='GBT')\nplt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')\nplt.xlabel('False positive rate')\nplt.ylabel('True positive rate')\nplt.title('ROC curve (zoomed in at top left)')\nplt.legend(loc='best')\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}